Let us set some global options for all code chunks in this document.

knitr::opts_chunk$set(
  message = FALSE,    # Disable messages printed by R code chunks
  warning = FALSE,    # Disable warnings printed by R code chunks
  echo = TRUE,        # Show R code within code chunks in output
  include = TRUE,     # Include both R code and its results in output
  eval = TRUE,       # Evaluate R code chunks
  cache = FALSE,       # Enable caching of R code chunks for faster rendering
  fig.align = "center",
  out.width = "100%",
  retina = 2,
  error = TRUE,
  collapse = TRUE
)
rm(list = ls())
set.seed(1982)

1 PREPROCESSING

Let us now load some required libraries.

# Load required libraries

# inla.upgrade(testing = TRUE)
# remotes::install_github("inlabru-org/inlabru", ref = "devel")
# remotes::install_github("davidbolin/rspde", ref = "devel")
# remotes::install_github("davidbolin/metricgraph", ref = "devel")
# remotes::install_github("davidbolin/ngme2", ref = "devel")

library(INLA)
inla.setOption(num.threads = 6)
library(inlabru)
library(rSPDE)
library(MetricGraph)
library(ngme2)

library(plotly)
library(dplyr)

library(sf)

library(here)

Function standarize() below is later used to standardize the covariate SpeedLimit.

standardize <- function(x) {return((x - mean(x)) / sd(x))}

1.1 PREPARING AND ADDING THE DATA TO THE GRAPH.

We load the graph object sf_graph (which only contains weights) and the data (already graph-processed).

timeallprocedure <- Sys.time()
load(here("Graph_objects/graph_construction_19MAY24_FRC0134.RData"))
load(here("Data_files/data_day7142128_hour13_with_no_consecutive_zeros_19MAY24_FRC0134_graph_processed.RData"))
data_on_graph = data_on_graph %>% 
  dplyr::select(-datetime)

The following commands remove zero speed observations that are 1m away from the graph, and after that, they remove any speed observations that are 3m away from the graph.

to_remove = data_on_graph %>%
  filter(speed == 0, .distance_to_graph > 0.001) 

data_on_graph = setdiff(data_on_graph, to_remove) %>% 
  filter(.distance_to_graph <= 0.003)

We add data to the graph.

sf_graph$add_observations(data = data_on_graph, 
                          group = "day", 
                          normalized = TRUE, 
                          clear_obs = TRUE)

We get the values of the weights at data locations. This essentially gives us covariates from the weights.

sf_graph$edgeweight_to_data(data_loc = TRUE)

When running sf_graph$edgeweight_to_data(data_loc = TRUE), some NA values are created (because the data is grouped). We remove them below. We also standardize the SpeedLimit covariate.

data <- sf_graph$get_data() %>% 
  drop_na(-StreetName) %>% # this drops all rows with at least one NA value but without taking into account StreetName
  mutate(across(c("SpeedLimit"), ~standardize(.))) %>%
  dplyr::select(speed, SpeedLimit)

1.2 NOW WE ADD THE FINAL VERSION OF THE DATA TO THE GRAPH

We add the data again but now with the new standardized SpeedLimit covariate.

sf_graph$add_observations(data = data, 
                          group = "day", 
                          normalized = TRUE, 
                          clear_obs = TRUE)

1.3 GET THE OBSERVATION LOCATIONS (FROM THE (FINAL VERSION OF THE) DATA EXTRACTED FROM THE GRAPH)

data_final <- sf_graph$get_data()
obs.loc <- data_final |> as.data.frame() |> dplyr::select(.edge_number, .distance_on_edge)

1.4 BUILD THE MESH

h = 0.05
sf_graph$build_mesh(h = h)

1.5 GET THE VALUES OF THE COVARIATE ON THE MESH LOCATIONS

We get the value of the weights at mesh locations. This will allow us to built matrices B.sigma and B.range below. Again, sf_graph$edgeweight_to_data(mesh = TRUE, add = FALSE, return = TRUE) creates repeated information (because the data is grouped). We fix that by filtering one group. We also standardize the SpeedLimit covariate.

mesh <- sf_graph$edgeweight_to_data(mesh = TRUE, 
                                   add = FALSE, 
                                   return = TRUE) %>% 
  filter(.group == 1) %>%
  mutate(across(c("SpeedLimit"), ~standardize(.))) %>%
  dplyr:::select.data.frame(SpeedLimit)

1.6 GET THE PROJECTION MATRIX (FROM THE MESH TO THE OBSERVATION LOCATIONS)

#mesh.loc <- sf_graph$mesh$VtE
A.proj <- sf_graph$fem_basis(obs.loc)

1.7 SIMULATE THE NONSTATIONARY FIELD

nonstat.time.ini <- Sys.time()

sigma <- 1.3
range <- 0.15
nu <- 0.5
sigma.e <- 0.1
rspde.order <- 1

B.sigma = cbind(0, 1, 0, mesh$SpeedLimit, 0)
B.range = cbind(0, 0, 1, 0, mesh$SpeedLimit)

cov_nonstat <- rSPDE::spde.matern.operators(graph = sf_graph,
                                                parameterization = "matern",
                                                B.sigma = B.sigma,
                                                B.range = B.range,
                                                theta = c(sigma, range, sigma, range),
                                                nu = nu)$covariance_mesh()
L <- chol(cov_nonstat)
u_nonstat <- t(L)%*%rnorm(dim(cov_nonstat)[1])

1.8 SIMULATE THE NONSTATIONARY RESPONSE

speed_sim_nonstat <- 0 + 0.95*data_final$SpeedLimit + as.vector(A.proj %*% u_nonstat + sigma.e * rnorm(nrow(data_final)))

1.9 CREATE A NEW DATASET (THE ONLY THING THAT CHANGES IS THE SIMULATED NONSTATIONARY RESPONSE)

data_sim_nonstat <- data_final %>% 
  mutate(speed = speed_sim_nonstat)

1.10 ADD THE DATA WITH SIMULATED RESPONSE (USING NONSTATIONARY FIELD)

sf_graph$add_observations(data = data_sim_nonstat, 
                          group = ".group", 
                          normalized = TRUE, 
                          clear_obs = TRUE)

2 STATIONARY MODEL

2.1 FIT THE STATIONARY MODEL

stat.time.ini <- Sys.time()

rspde_model_stat <- rspde.metric_graph(sf_graph,
                                         parameterization = "matern",
                                         nu = nu)

data_rspde_bru_stat <- graph_data_rspde(rspde_model_stat,
                                        repl = ".all",
                                        loc_name = "loc")
cmp_stat = speed ~ -1 +
  Intercept(1) +
  SpeedLimit +
  field(loc, model = rspde_model_stat,
        replicate = data_rspde_bru_stat[["repl"]])

rspde_fit_stat <-
  bru(cmp_stat,
      data = data_rspde_bru_stat[["data"]],
      family = "gaussian",
      options = list(verbose = FALSE)
  )

2.2 SHOW THE RESULTS FOR THE STATIONARY MODEL

stat.time.fin <- Sys.time()
print(stat.time.fin - stat.time.ini)
## Time difference of 2.27779 mins
summary(rspde_fit_stat)
## inlabru version: 2.10.1.9007
## INLA version: 24.05.18-2
## Components:
## Intercept: main = linear(1), group = exchangeable(1L), replicate = iid(1L)
## SpeedLimit: main = linear(SpeedLimit), group = exchangeable(1L), replicate = iid(1L)
## field: main = cgeneric(loc), group = exchangeable(1L), replicate = iid(data_rspde_bru_stat[["repl"]])
## Likelihoods:
##   Family: 'gaussian'
##     Data class: 'metric_graph_data', 'list'
##     Predictor: speed ~ .
## Time used:
##     Pre = 0.627, Running = 19.1, Post = 4.32, Total = 24 
## Fixed effects:
##             mean    sd 0.025quant 0.5quant 0.975quant  mode kld
## Intercept  0.035 0.153     -0.266    0.035      0.335 0.035   0
## SpeedLimit 0.955 0.009      0.938    0.955      0.972 0.955   0
## 
## Random effects:
##   Name     Model
##     field CGeneric
## 
## Model hyperparameters:
##                                           mean    sd 0.025quant 0.5quant
## Precision for the Gaussian observations 100.14 1.396      97.40   100.14
## Theta1 for field                          5.39 0.008       5.38     5.39
## Theta2 for field                         -8.07 0.018      -8.11    -8.06
##                                         0.975quant   mode
## Precision for the Gaussian observations     102.90 100.16
## Theta1 for field                              5.41   5.39
## Theta2 for field                             -8.04  -8.05
## 
## Deviance Information Criterion (DIC) ...............: -30255.43
## Deviance Information Criterion (DIC, saturated) ....: 41303.55
## Effective number of parameters .....................: 15457.13
## 
## Watanabe-Akaike information criterion (WAIC) ...: -33455.90
## Effective number of parameters .................: 9218.52
## 
## Marginal log-Likelihood:  -54298.15 
##  is computed 
## Posterior summaries for the linear predictor and the fitted values are computed
## (Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')
fit.rspde = rspde.result(rspde_fit_stat, "field", rspde_model_stat)
summary(fit.rspde)
##                mean          sd  0.025quant    0.5quant  0.975quant        mode
## std.dev 2.20161e+02 1.70109e+00 2.17522e+02 2.19920e+02 2.24025e+02 2.19028e+02
## range   3.13880e-04 5.71950e-06 2.90242e-04 3.24908e-04 3.24908e-04 3.19394e-04

3 NONSTATIONARY MODEL

3.1 FIT THE NONSTATIONARY MODEL

init.vec.theta = c(fit.rspde$summary.log.std.dev$mode, 
                   fit.rspde$summary.log.range$mode, 
                   rep(0, (ncol(B.sigma)-3)))

rspde_model_nonstat <- rspde.metric_graph(sf_graph,
                                          B.sigma = B.sigma,
                                          B.range = B.range,
                                          parameterization = "matern",
                                          nu = nu)

data_rspde_bru_nonstat <- graph_data_rspde(rspde_model_nonstat,
                                           repl = ".all",
                                           loc_name = "loc")
cmp_nonstat = speed ~ -1 +
  Intercept(1) +
  SpeedLimit +
  field(loc, model = rspde_model_nonstat,
        replicate = data_rspde_bru_nonstat[["repl"]])

rspde_fit_nonstat <-
  bru(cmp_nonstat,
      data = data_rspde_bru_nonstat[["data"]],
      family = "gaussian",
      options = list(verbose = FALSE)
  )

3.2 SHOW THE RESULTS FOR THE NONSTATIONARY MODEL

nonstat.time.fin <- Sys.time()
print(nonstat.time.fin - nonstat.time.ini)
## Time difference of 18.38253 mins
summary(rspde_fit_nonstat)
## inlabru version: 2.10.1.9007
## INLA version: 24.05.18-2
## Components:
## Intercept: main = linear(1), group = exchangeable(1L), replicate = iid(1L)
## SpeedLimit: main = linear(SpeedLimit), group = exchangeable(1L), replicate = iid(1L)
## field: main = cgeneric(loc), group = exchangeable(1L), replicate = iid(data_rspde_bru_nonstat[["repl"]])
## Likelihoods:
##   Family: 'gaussian'
##     Data class: 'metric_graph_data', 'list'
##     Predictor: speed ~ .
## Time used:
##     Pre = 0.387, Running = 23.2, Post = 2.98, Total = 26.6 
## Fixed effects:
##              mean    sd 0.025quant 0.5quant 0.975quant   mode kld
## Intercept  -0.001 0.007     -0.015   -0.001      0.014 -0.001   0
## SpeedLimit  0.953 0.004      0.945    0.953      0.961  0.953   0
## 
## Random effects:
##   Name     Model
##     field CGeneric
## 
## Model hyperparameters:
##                                            mean    sd 0.025quant 0.5quant
## Precision for the Gaussian observations 100.265 1.454     97.102  100.367
## Theta1 for field                          1.475 0.023      1.429    1.476
## Theta2 for field                          0.524 0.047      0.427    0.526
## Theta3 for field                          1.057 0.025      1.015    1.055
## Theta4 for field                         -0.347 0.051     -0.432   -0.351
##                                         0.975quant    mode
## Precision for the Gaussian observations    102.749 100.915
## Theta1 for field                             1.519   1.479
## Theta2 for field                             0.614   0.532
## Theta3 for field                             1.112   1.044
## Theta4 for field                            -0.234  -0.372
## 
## Deviance Information Criterion (DIC) ...............: -32216.17
## Deviance Information Criterion (DIC, saturated) ....: 39358.89
## Effective number of parameters .....................: 13463.75
## 
## Watanabe-Akaike information criterion (WAIC) ...: -34313.36
## Effective number of parameters .................: 8649.63
## 
## Marginal log-Likelihood:  -13190.00 
##  is computed 
## Posterior summaries for the linear predictor and the fitted values are computed
## (Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')
summary(rspde.result(rspde_fit_nonstat, "field", rspde_model_nonstat))
##                    mean        sd 0.025quant  0.5quant 0.975quant      mode
## Theta1.matern  1.475480 0.0229099   1.428510  1.476120   1.518630  1.478960
## Theta2.matern  0.524409 0.0474374   0.427138  0.525725   0.613737  0.531623
## Theta3.matern  1.056740 0.0251363   1.014760  1.054640   1.112240  1.044220
## Theta4.matern -0.346695 0.0510541  -0.431910 -0.350967  -0.233932 -0.372196

4 CROSS-VALIDATION

#load(here("Models_output/distmatrixfixed.RData"))

points = data_final %>%
  as.data.frame() %>%
  mutate(., index = 1:nrow(.)) %>% 
  dplyr:::select(speed, .group, index) %>%
  mutate(.group = as.numeric(.group)) %>%
  group_by(.group) %>%
  mutate(indexingroup = seq_len(n())) %>%
  ungroup()

distance = seq(from = 0, to = 200, by = 20)/1000

The code of chunk below was executed only one time.


{r}
load(here("Models_output/distmatrixfixed_19May24.RData"))

points = data_final %>%
  as.data.frame() %>%
  mutate(., index = 1:nrow(.)) %>% 
  dplyr:::select(speed, .group, index) %>%
  mutate(.group = as.numeric(.group)) %>%
  group_by(.group) %>%
  mutate(indexingroup = seq_len(n())) %>%
  ungroup()

distance = seq(from = 0, to = 200, by = 20)/1000

GROUPS <- list()
for (j in 1:length(distance)) {
  print(j)
  GROUPS[[j]] = list()
  for (i in 1:nrow(points)) {
    rowi = points[i, ]
    which.in.group <- which(as.vector(distmatrixlist[[rowi$.group]][rowi$indexingroup,]) <= distance[j])
    GROUPS[[j]][[i]] <- filter(points, .group == rowi$.group)[which.in.group, ]$index
  }
}
save(GROUPS, file = here("Models_output/GROUPS_19May24_corrected.RData"))
load(here("Models_output/GROUPS_19May24_corrected.RData"))

The code of chunk above was executed only one time.


ggplot(data_final, aes(x = .coord_x, y = .coord_y, color = .group)) +
  geom_point(size = 0.1) +
  facet_wrap(~ .group) +
  theme_minimal() +
  labs(title = "Scatter Plot of Points by Groups",
       x = "X-axis Label",
       y = "Y-axis Label")

indexofinterest <- 21345
pointofinterest <- GROUPS[[11]][[indexofinterest]]
windowofinterest <- data_final[pointofinterest, ] |> as.data.frame() |> st_as_sf(coords = c(".coord_x", ".coord_y"), crs = 4326)

bbox <- st_bbox(windowofinterest)

polygon <- st_polygon(list(rbind(
  c(bbox["xmin"], bbox["ymin"]),
  c(bbox["xmax"], bbox["ymin"]),
  c(bbox["xmax"], bbox["ymax"]),
  c(bbox["xmin"], bbox["ymax"]),
  c(bbox["xmin"], bbox["ymin"])
))) |>
st_sfc(crs = st_crs(windowofinterest))

data_final_filtered <- data_final %>% 
  as.data.frame() %>%
  st_as_sf(coords = c(".coord_x", ".coord_y"), crs = 4326) %>%
  st_filter(x = ., y = polygon, .predicate = st_within)

ggplot() +
  geom_sf(data = data_final_filtered, aes(color = .group))


ggplot() +
  geom_sf(data = data_final_filtered, aes(color = .group)) +
  geom_point(data = data_final[pointofinterest, ], aes(x = .coord_x, y = .coord_y), color = "blue") +
  geom_point(data = data_final[indexofinterest, ], aes(x = .coord_x, y = .coord_y), color = "red")


ggplot() +
  geom_point(data = data_final[pointofinterest, ], aes(x = .coord_x, y = .coord_y), color = "blue") +
  geom_point(data = data_final[indexofinterest, ], aes(x = .coord_x, y = .coord_y), color = "red")


ggplot() +
  geom_point(data = filter(data_final, .group == data_final[indexofinterest, ]$.group), aes(x = .coord_x, y = .coord_y), color = "black") +
  geom_point(data = data_final[pointofinterest, ], aes(x = .coord_x, y = .coord_y), color = "blue") +
  geom_point(data = data_final[indexofinterest, ], aes(x = .coord_x, y = .coord_y), color = "red")

mse.stat <- mse.nonstat <- ls.stat <- ls.nonstat <- rep(0,length(distance))
# cross-validation for-loop
for (j in 1:length(distance)) {
  print(j)
  # cross-validation of the stationary model
  cv.stat <- inla.group.cv(rspde_fit_stat, groups = GROUPS[[j]])
  # cross-validation of the nonstationary model
  cv.nonstat <- inla.group.cv(rspde_fit_nonstat, groups = GROUPS[[j]])
  # obtain MSE and LS
  mse.stat[j] <- mean((cv.stat$mean - data_sim_nonstat$speed)^2)
  mse.nonstat[j] <- mean((cv.nonstat$mean - data_sim_nonstat$speed)^2)
  ls.stat[j] <- mean(log(cv.stat$cv))
  ls.nonstat[j] <- mean(log(cv.nonstat$cv))
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11

## plot results
par(mfrow = c(2,2), family = "Palatino")

# Plot MSE
plot(distance, mse.stat, main = "MSE", ylim = c(min(mse.nonstat, mse.stat), max(mse.nonstat, mse.stat)),
     type = "l", ylab = "MSE", xlab = "distance in m", col = "black")
lines(distance, mse.nonstat, col = "blue")
legend("bottomright", legend = c("Stationary", "Non-stationary"), col = c("black", "blue"), lty = 1)

# Plot log-score
plot(distance, -ls.stat, main = "log-score", ylim = c(min(-ls.nonstat, -ls.stat), max(-ls.nonstat, -ls.stat)),
     type = "l", ylab = "log-score", xlab = "distance in m", col = "black")
lines(distance, -ls.nonstat, col = "blue")
legend("bottomright", legend = c("Stationary", "Non-stationary"), col = c("black", "blue"), lty = 1)

finaltimeallprocedure <- Sys.time()
print(finaltimeallprocedure - timeallprocedure)
## Time difference of 23.07478 mins
save.image(here(paste0("Models_output/", rmarkdown::metadata$title, ".RData")))